************************************************************************************************              
**** This do.file builds outcome measures across the product space (at the "national level")
************************************************************************************************               


global db2 "D:\Dropbox\unequal_gains\QJE revision plan\analysis"
global db "D:\Dropbox\unequal_gains\main_data"

************************************************************
** 0. Merge GS1 and UPC data 
************************************************************

/* i) Prepare GS1 data
clear
import delimited "/export/home/doctoral/xjaravel/Unequal Innovation/nielsen_extracts/GS1 Data/gs1_feb2016.csv", stringcols(1 2 3)
replace entitygln=gs1companyprefix if missing(entitygln)
keep entitygln upccompanyprefix
drop if missing(upccompanyprefix)
* check that the upccompanyprefix is unique (indeed it is)
bysort upccompanyprefix: gen dup=[_n]
drop if dup>1
drop dup
save "/export/home/doctoral/xjaravel/Unequal Innovation/nielsen_extracts/GS1 Data/manufacturer_id.dta", replace

* ii) Merge to UPC data
foreach z of numlist 2004(1)2015 {
clear
import delimited "/export/home/doctoral/xjaravel/Unequal Innovation/nielsen_extracts/HMS/`z'/Annual_Files/purchases_`z'.tsv", stringcols(2) 
* keep numeric identifier so that we can merge to our other data
gen double upc_num=real(upc)
* get rid of leading "system number"
gen upc_nosys=substr(upc,2,length(upc))
keep upc_nosys upc_num 
duplicates drop 

foreach i of numlist 6(1)10{
gen upccompanyprefix_`i'=substr(upc_nosys,1,`i')
}
* we do multiple merges, under different assumptions about the number of digits in the manufacturer prefix
foreach i of numlist 6(1)10{
rename upccompanyprefix_`i' upccompanyprefix
merge m:1 upccompanyprefix using "/export/home/doctoral/xjaravel/Unequal Innovation/nielsen_extracts/GS1 Data/manufacturer_id.dta"
drop if _merge==2
rename _merge _merge_`i'
rename entitygln entitygln`i'
rename upccompanyprefix upccompanyprefix_`i'
}

gen manufacturer=entitygln6
foreach i in 7 8 9 10 {
replace manufacturer=entitygln`i' if missing(manufacturer)
}
gen missing_manufacturer=0
replace missing_manufacturer=1 if missing(manufacturer)
tab missing
keep  upc_num upc_nosys manufacturer missing_manufacturer
save "$db/Important Datasets/manufacturers_`z'.dta", replace
}
*/

***********************************************************************
** We build structural measures of markup level & change in markups ***
***********************************************************************

** A. Compute markup levels based on manufacturer shares (in RMS data)
cd "$db2\RMS\collapsed_files\state_level"

foreach i of numlist 2006(1)2015 {

use "$db/Important Datasets/manufacturers_`i'.dta", clear
rename upc_num upc
merge 1:m upc using rms_`i'
keep if _merge==3
drop _merge
drop if missing_manufacturer==1
drop missing_manufacturer

* build "quality adjusted" price deciles
* to do so, bring in size information and get rid of outliers
gen upc_ver_uc=1
merge m:1 upc upc_ver_uc using "$db/Important Lists/upc_brand_size_final.dta"
keep if _merge==3
drop _merge
sum size1_amount, d
drop if size1_amount<r(p1) | size1_amount>r(p99)
gen average_unit_price_adj=average_unit_price/(size1_amount*multi)
* create quality ranks without weights
sort product_module_code average_unit_price_adj
bysort product_module_code: gen double temp=[_N]
bysort product_module_code: gen double temp2=[_n]
gen rank=temp2/temp*100
gen quality_rank=.
foreach r of numlist 1(1)10 {
replace quality_rank=`r' if rank<=`r'*10 & missing(quality_rank)
}
bysort product_module_code quality_rank: egen double totspend_l_base = sum(total_spending)

* now compute manufacturer shares & implied markups at various levels 

** With Betrand competition formula from Hottman et al. (2016)

* within modules by quality deciles
bysort product_module_code quality_rank manufacturer: egen double totspend_l_manuf = sum(total_spending)
gen implied_e_module_quality = 4 - (4-1)*totspend_l_manuf/totspend_l_base
gen implied_markup_module_quality = implied_e_module_quality/(implied_e_module_quality-1)
sum implied_markup_module_quality, d
sum implied_markup_module_quality [aw=totspend_l_manuf], d

* within modules 
bysort product_module_code manufacturer: egen double totspend_l_manuf_module = sum(total_spending)
bysort product_module_code: egen double totspend_l_base_module = sum(total_spending)
gen implied_e_module = 4 - (4-1)*totspend_l_manuf_module/totspend_l_base_module
gen implied_markup_module = implied_e_module/(implied_e_module-1)
sum implied_markup_module, d
sum implied_markup_module [aw=totspend_l_manuf_module], d

* within groups
bysort product_group_code manufacturer: egen double totspend_l_manuf_group = sum(total_spending)
bysort product_group_code: egen double totspend_l_base_group = sum(total_spending)
gen implied_e_group = 4 - (4-1)*totspend_l_manuf_group/totspend_l_base_group
gen implied_markup_group = implied_e_group/(implied_e_group-1)
sum implied_markup_group, d
sum implied_markup_group [aw=totspend_l_manuf_group], d

** With Cournot competition formula from Hottman et al. (2016)

* within modules by quality deciles
gen implied_e_module_quality_C = 1/(1/4-(1/4-1)*totspend_l_manuf/totspend_l_base)
gen implied_markup_module_quality_C = implied_e_module_quality_C/(implied_e_module_quality_C-1)
sum implied_markup_module_quality_C, d
sum implied_markup_module_quality_C [aw=totspend_l_manuf], d

* within modules 
gen implied_e_module_C = 1/(1/4-(1/4-1)*totspend_l_manuf_module/totspend_l_base_module)
gen implied_markup_module_C = implied_e_module_C/(implied_e_module_C-1)
sum implied_markup_module_C, d
sum implied_markup_module_C [aw=totspend_l_manuf_module], d

* within groups
gen implied_e_group_C = 1/(1/4-(1/4-1)*totspend_l_manuf_group/totspend_l_base_group)
gen implied_markup_group_C = implied_e_group_C/(implied_e_group_C-1)
sum implied_markup_group_C, d
sum implied_markup_group_C [aw=totspend_l_manuf_group], d

save "$db/Important Datasets/markups_year`i'_RMS.dta", replace
}

** B. Compute markup changes over time

clear
set obs 1
gen price_index="."
save "$db/Important Price Datasets/RMS_markup_all.dta", replace 

foreach i of numlist 2006(1)2014 {

* load base-year data 
use "$db/Important Datasets/markups_year`i'_RMS.dta", clear
rename total_quantity total_quantity_old
rename average_unit_price_all average_unit_price_all_old

rename implied_markup_module_quality implied_markup_MQ_old
rename implied_markup_module_quality_C implied_markup_MQ_C_old
 
drop total_spending 

* load next-year data 
local var=`i'+1
merge 1:1 upc upc_ver using "$db/Important Datasets/markups_year`var'_RMS.dta"
keep if _merge==3
drop total_spending

gen double price_ratio = average_unit_price_all/average_unit_price_all_old
gen double markup_ratio = implied_markup_module_quality/implied_markup_MQ_old
gen double markup_ratio_C = implied_markup_module_quality_C/implied_markup_MQ_C_old
drop if missing(price_ratio) | missing(markup_ratio)  | missing(markup_ratio_C)

foreach i in price_ratio markup_ratio markup_ratio_C {
sum `i', d 
gen p1_`i'=r(p1)
gen p99_`i'=r(p99)
}

* trim the data as we usually do
foreach i in price_ratio markup_ratio markup_ratio_C {
drop if `i' < p1_`i' | `i'> p99_`i'
}

** compute CES price index
bysort product_module_code quality_rank: egen double total_spending=sum(average_unit_price_all*total_quantity)
bysort product_module_code quality_rank: egen double total_spending_old=sum(average_unit_price_all_old*total_quantity_old)
gen double share=average_unit_price_all*total_quantity/total_spending
gen double share_old=average_unit_price_all_old*total_quantity_old/total_spending_old

gen double weight_num=(share-share_old)/(ln(share)-ln(share_old))
bysort product_module_code quality_rank: egen double weight_den=sum(weight_num)
gen double weight=weight_num/weight_den

gen double numerator_temp2=weight*log(markup_ratio)
bysort product_module_code quality_rank: egen double numerator2=sum(numerator_temp2)
gen double ces_markup_index=exp(numerator2)

gen double numerator_temp3=weight*log(markup_ratio_C)
bysort product_module_code quality_rank: egen double numerator3=sum(numerator_temp3)
gen double ces_markup_C_index=exp(numerator3)

** compute Tornqvist price index
drop weight* numerator*
gen weight=1/2*(share+share_old)

gen double numerator_temp=weight*log(markup_ratio)
bysort product_module_code quality_rank: egen double numerator=sum(numerator_temp)
gen double tornqvist_markup_index=exp(numerator)
drop numerator numerator_temp 

gen double numerator_temp=weight*log(markup_ratio_C)
bysort product_module_code quality_rank: egen double numerator=sum(numerator_temp)
gen double tornqvist_markup_C_index=exp(numerator)
drop numerator numerator_temp

gen avg_total_spending=(total_spending+total_spending_old)/2
keep ces_markup_index ces_markup_C_index tornqvist_markup_index tornqvist_markup_C_index ///
product_module_code quality_rank avg_total_spending totspend_l_base ///
product_group_code department_code
duplicates drop 
gen year=`var'

generate price_index="ces_ideal"
append using "$db/Important Price Datasets/RMS_markup_all.dta"
drop if missing(price_index)
save "$db/Important Price Datasets/RMS_markup_all.dta", replace 
}

* compute average inflation rate by collapsing over all years
use "$db/Important Price Datasets/RMS_markup_all.dta", clear
egen id=group(product_module quality_rank)

bysort id: gen periods=[_N]
tab periods
keep if periods==7 | periods==9

* winsorize the way we always do
foreach i in  ces_markup_index ces_markup_C_index ///
tornqvist_markup_index tornqvist_markup_C_index {
bysort department_code year: egen p5 = pctile(`i'), p(5)
bysort department_code year: egen p95 = pctile(`i'), p(95)
gen `i'_win = `i'
replace `i'_win = p95 if `i'>p95
replace `i'_win = p5 if `i'<p5
drop p5 p95
}

collapse (mean) tornqvist* ces*, by(quality_rank department_code product_group_code product_module_code)
distinct product_module_code

save "$db/Important Price Datasets/RMS_markup_all_final.dta", replace

